from langchain.document_loaders import PyPDFLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the PDF and split it into LangChain Document objects.
loader = PyPDFLoader('E:\\project\\ai\\zhihu_llm\\RAG\\rzf.pdf')

pages = loader.load_and_split()

# Quick sanity check: show the first document produced by the loader.
print(pages[0])

# Text splitter configuration:
#   chunk_size=500       - maximum size of each chunk, as measured by
#                          length_function (here: 500 characters).
#   chunk_overlap=50     - neighbouring chunks may share up to 50 characters,
#                          so context is not lost at chunk boundaries.
#   length_function=len  - measure text length with len(), i.e. count
#                          characters.
#   add_start_index=True - record where each chunk starts in the source text
#                          (stored in the chunk's metadata as "start_index").
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,  # measure length in characters
    add_start_index=True,
)

# create_documents() expects a list of texts. Passing the bare string would
# iterate it character by character and yield one document per character,
# so the page text is wrapped in a single-element list.
paragraphs = text_splitter.create_documents([pages[0].page_content])

# for paragraph in paragraphs:
#   print(paragraph.page_content)
#   print('------')
